#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#
# Copyright (c) 2022 PanXu, Inc. All Rights Reserved
#
"""
brief

Authors: PanXu
Date:    2022/10/08 21:07:00
"""


from sklearn.feature_extraction.text import TfidfVectorizer

# Demo 1: default TfidfVectorizer. The default token_pattern
# r"(?u)\b\w\w+\b" only matches tokens of two or more word characters,
# so single-character tokens (e.g. "我", "爱") are silently dropped.
tfidf_vec = TfidfVectorizer()   # token_pattern = "(?u)\b\w\w+\b"

# Pre-segmented Chinese sentences: tokens are separated by spaces, so the
# default whitespace-based tokenization applies directly.
texts = ["我 爱 中国",
         "中国 山川"]

# Learn the vocabulary and IDF weights, then transform to a sparse matrix.
vecs = tfidf_vec.fit_transform(texts)

print("向量", vecs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
print("feature", tfidf_vec.get_feature_names_out())


print("解决单字被过滤的问题---------------")
# Demo 2: fix the single-character filtering problem by overriding
# token_pattern to r"(?u)\b\w+\b", which accepts tokens of length >= 1,
# so single Chinese characters like "我" and "爱" are kept as features.
tfidf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")

# Refit on the same pre-segmented texts with the relaxed tokenizer.
vecs = tfidf_vec.fit_transform(texts)

print("向量", vecs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
print("feature", tfidf_vec.get_feature_names_out())


